import sys
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
sys.path.append("./utils")
import loader
import round_classifier
from Parser import Parser
# Load the raw CS:GO round-snapshot dataset via the project loader.
all_data = loader.load_dataset()
all_columns = all_data.columns
len(all_data.columns)
97
# Parser presumably aggregates raw per-weapon columns into the engineered
# weapon-class features used below (t_main_rifle, ct_weapon_awp, ...) —
# TODO confirm against utils/Parser.
parser = Parser()
# treat_data = all_data.copy()
treat_data = parser.classify_weapons(all_data.copy())
# treat_data = parser.create_round_winner_columns(treat_data.copy())
# Flag pistol rounds (they occur at the start of the game and at round 16).
treat_data["pistol_round"] = treat_data.apply(
    round_classifier.define_pistol_round, axis=1
)
# Flag full-eco rounds for each side.
treat_data["ct_eco"] = treat_data.apply(round_classifier.define_full_ct_eco, axis=1)
treat_data["t_eco"] = treat_data.apply(round_classifier.define_full_t_eco, axis=1)
# Flag force-buy rounds for each side.
treat_data["ct_force"] = treat_data.apply(
    round_classifier.define_force_ct_round, axis=1
)
treat_data["t_force"] = treat_data.apply(round_classifier.define_force_t_round, axis=1)
# Selecting columns: the first 16 raw columns (time, scores, health, money,
# ...) plus the engineered weapon/economy features and the target columns.
normal_columns = all_columns[0:16].tolist()
made_columns_t = [
    "t_main_rifle",
    "t_sec_rifle",
    "t_force_weapons",
    "t_weak_pistols",
    "t_strong_pistols",
    "t_granades",
    "t_weapon_awp",
    "t_eco",
    "t_force",
]
made_columns_ct = [
    "ct_main_rifle",
    "ct_sec_rifle",
    "ct_force_weapons",
    "ct_weak_pistols",
    "ct_strong_pistols",
    "ct_granades",
    "ct_weapon_awp",
    "ct_eco",
    "ct_force",
]
extra_columns = [
    "pistol_round",
    "round_winner",
]
selected_columns = normal_columns + made_columns_ct + made_columns_t + extra_columns
# Take an explicit copy so final_df is an independent DataFrame rather than
# a view of treat_data: the later in-place column assignments otherwise
# trigger pandas' SettingWithCopyWarning and may silently fail to stick.
final_df = treat_data[selected_columns].copy()
# checando nans
final_df
| time_left | ct_score | t_score | map | bomb_planted | ct_health | t_health | ct_armor | t_armor | ct_money | ... | t_sec_rifle | t_force_weapons | t_weak_pistols | t_strong_pistols | t_granades | t_weapon_awp | t_eco | t_force | pistol_round | round_winner | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 175.00 | 0.0 | 0.0 | de_dust2 | False | 500.0 | 500.0 | 0.0 | 0.0 | 4000.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 1 | CT |
| 1 | 156.03 | 0.0 | 0.0 | de_dust2 | False | 500.0 | 500.0 | 400.0 | 300.0 | 600.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 0 | 0 | 1 | CT |
| 2 | 96.03 | 0.0 | 0.0 | de_dust2 | False | 391.0 | 400.0 | 294.0 | 200.0 | 750.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 0 | 0 | 1 | CT |
| 3 | 76.03 | 0.0 | 0.0 | de_dust2 | False | 391.0 | 400.0 | 294.0 | 200.0 | 750.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 1 | CT |
| 4 | 174.97 | 1.0 | 0.0 | de_dust2 | False | 500.0 | 500.0 | 192.0 | 0.0 | 18350.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1 | 0 | 0 | CT |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 122405 | 15.41 | 11.0 | 14.0 | de_train | True | 200.0 | 242.0 | 195.0 | 359.0 | 100.0 | ... | 0.0 | 0.0 | 2.0 | 0.0 | 3.0 | 1.0 | 0 | 0 | 0 | T |
| 122406 | 174.93 | 11.0 | 15.0 | de_train | False | 500.0 | 500.0 | 95.0 | 175.0 | 11500.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 1.0 | 0 | 0 | 0 | T |
| 122407 | 114.93 | 11.0 | 15.0 | de_train | False | 500.0 | 500.0 | 495.0 | 475.0 | 1200.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 14.0 | 1.0 | 0 | 0 | 0 | T |
| 122408 | 94.93 | 11.0 | 15.0 | de_train | False | 500.0 | 500.0 | 495.0 | 475.0 | 1200.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 12.0 | 1.0 | 0 | 0 | 0 | T |
| 122409 | 74.93 | 11.0 | 15.0 | de_train | False | 375.0 | 479.0 | 395.0 | 466.0 | 1100.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 8.0 | 1.0 | 0 | 0 | 0 | T |
122410 rows × 36 columns
len(final_df.columns)
36
def round_winner_binnary(dataframe):
    """Append a binary target column 'roundwinner_bin' to *dataframe*.

    Maps 'round_winner' values "CT" -> 1 and "T" -> 0. Mutates the frame
    in place and returns None.

    Raises:
        ValueError: if 'round_winner' holds any value other than "CT"/"T".
            (The original row-by-row loop silently skipped such rows and
            then failed with a confusing length-mismatch error.)
    """
    # Vectorized map replaces the original Python loop over .iloc, which
    # was O(n) interpreter-level work per row.
    mapped = dataframe["round_winner"].map({"CT": 1, "T": 0})
    if mapped.isnull().any():
        raise ValueError("round_winner contains values other than 'CT'/'T'")
    dataframe["roundwinner_bin"] = mapped.astype(int)
# Build the binary target column on the selected frame (mutates final_df).
round_winner_binnary(final_df)
C:\Users\luca\AppData\Local\Temp/ipykernel_26344/2013146941.py:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy dataframe["roundwinner_bin"] = round_winner_bin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# Binary target: 1 = CT won the round, 0 = T won.
labels = final_df["roundwinner_bin"].copy()
# Columns treated as categorical (they hold map names, flags, or small
# discrete counts) ahead of one-hot encoding.
cat_attribs = [
    "map",
    "bomb_planted",
    "ct_eco",
    "ct_force",
    "t_eco",
    "t_force",
    "pistol_round",
    "ct_helmets",
    "t_helmets",
    "ct_defuse_kits",
    "ct_players_alive",
    "t_players_alive",
    "ct_main_rifle",
    "ct_sec_rifle",
    "ct_force_weapons",
    "ct_weak_pistols",
    "ct_strong_pistols",
    "ct_weapon_awp",
    "t_main_rifle",
    "t_sec_rifle",
    "t_force_weapons",
    "t_weak_pistols",
    "t_strong_pistols",
    "t_weapon_awp",
]
# Single vectorized conversion instead of the original per-column Python
# loop (which also emitted a SettingWithCopyWarning on every iteration).
final_df[cat_attribs] = final_df[cat_attribs].astype("category")
C:\Users\luca\AppData\Local\Temp/ipykernel_26344/61044880.py:29: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
final_df[col] = final_df[col].astype("category")
# meu_imputer = SimpleImputer(strategy="median")
# Numeric branch: standardize features to zero mean / unit variance.
meu_scaler = StandardScaler()
num_pipeline = Pipeline(steps=[("std_scaler", meu_scaler)])

# Categorical branch: one-hot encode into a dense array.
# NOTE(review): `sparse=` was renamed to `sparse_output=` in scikit-learn
# 1.2 and removed in 1.4 — update if the environment is upgraded.
meu_one_hot_encoder = OneHotEncoder(sparse=False)
cat_pipeline = Pipeline(steps=[("cat_encoder", meu_one_hot_encoder)])
# Numeric feature frame: everything except the categorical columns and the
# two target columns.
dataset_num = final_df.drop(
    labels=[
        "map",
        "bomb_planted",
        "ct_eco",
        "ct_force",
        "t_eco",
        "t_force",
        "pistol_round",
        "ct_helmets",
        "t_helmets",
        "ct_defuse_kits",
        "ct_players_alive",
        "t_players_alive",
        "ct_main_rifle",
        "ct_sec_rifle",
        "ct_force_weapons",
        "ct_weak_pistols",
        "ct_strong_pistols",
        "ct_weapon_awp",
        "t_main_rifle",
        "t_sec_rifle",
        "t_force_weapons",
        "t_weak_pistols",
        "t_strong_pistols",
        "t_weapon_awp",
        "round_winner",
        "roundwinner_bin",
    ],
    axis=1,
)
num_attribs = list(dataset_num)
# Categorical features for one-hot encoding. BUG FIX: the original list
# omitted "ct_defuse_kits", "ct_main_rifle" and "ct_sec_rifle" even though
# they are dropped from the numeric set above, so those engineered features
# were silently excluded from the model while their t-side counterparts
# were kept. They are restored here.
cat_attribs = [
    "map",
    "bomb_planted",
    "ct_eco",
    "ct_force",
    "t_eco",
    "t_force",
    "pistol_round",
    "ct_helmets",
    "t_helmets",
    "ct_defuse_kits",
    "ct_players_alive",
    "t_players_alive",
    "ct_main_rifle",
    "ct_sec_rifle",
    "ct_force_weapons",
    "ct_weak_pistols",
    "ct_strong_pistols",
    "ct_weapon_awp",
    "t_main_rifle",
    "t_sec_rifle",
    "t_force_weapons",
    "t_weak_pistols",
    "t_strong_pistols",
    "t_weapon_awp",
]
# Combined preprocessing: scale the numeric columns, one-hot encode the
# categorical ones.
full_pipeline = ColumnTransformer(
    [
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs),
    ]
)
# Feature matrix with both target columns removed, then run through the
# preprocessing pipeline (scaling + one-hot encoding).
x_raw = final_df.drop(columns=["round_winner", "roundwinner_bin"])
dataset_prepared = full_pipeline.fit_transform(x_raw)
from sklearn.model_selection import train_test_split
seed = 42
X = dataset_prepared
# 2/3 train, 1/3 test split of the prepared features.
X_train, X_test, y_train, y_test = train_test_split(
    X, labels, test_size=0.333, random_state=seed
)
# Parallel split of the *unprepared* features — same seed, so the same rows
# land in each partition; kept for inspection.
X_t, X_te, y_t, y_te = train_test_split(
    x_raw, labels, test_size=0.333, random_state=seed
)
from sklearn.ensemble import RandomForestClassifier
# "sqrt" replaces the deprecated "auto" alias — for classifiers both mean
# sqrt(n_features) candidates per split, so behavior is identical, but
# "auto" was removed in scikit-learn 1.3.
rf_classifier = RandomForestClassifier(
    n_estimators=350, n_jobs=-1, random_state=seed, max_features="sqrt"
)
rf_classifier.fit(X_train, y_train)
RandomForestClassifier(n_estimators=350, n_jobs=-1, random_state=42)
from sklearn.model_selection import cross_val_score
# 3-fold cross-validated accuracy of the random forest on the training set.
cross_val_score(
    rf_classifier,
    X_train,
    y_train,
    cv=3,
    scoring="accuracy",
    n_jobs=-1,
)
array([0.84020429, 0.85034539, 0.84449752])
from sklearn.linear_model import SGDClassifier
# SGDClassifier shuffles data internally, hence the fixed random_state.
RANDOM_SEED = 420
sgd_clf = SGDClassifier(
    max_iter=500,
    tol=1e-3,
    random_state=RANDOM_SEED,
)
sgd_clf.fit(X_train, y_train)
# BUG FIX: the original called rf_classifier.predict here, so y_pred_sgd
# actually contained random-forest predictions, not SGD predictions.
y_pred_sgd = sgd_clf.predict(X_test)
y_pred_sgd
array([1, 0, 1, ..., 0, 1, 0], dtype=int64)
from sklearn.model_selection import cross_val_score
# 3-fold cross-validated accuracy of the SGD linear classifier.
cross_val_score(
    sgd_clf,
    X_train,
    y_train,
    cv=3,
    scoring="accuracy",
    n_jobs=-1,
)
array([0.74775867, 0.74875073, 0.74870476])
from sklearn.naive_bayes import GaussianNB
# Create a Gaussian naive Bayes classifier.
gaussianN = GaussianNB()
# Train it on the prepared training set.
gaussianN.fit(X_train, y_train)
# Predict round winners for the test set. (The original trailing comment
# "0:Overcast, 2:Mild" was leftover from a weather-dataset tutorial and
# did not apply here.)
predicted = gaussianN.predict(X_test)
predicted
array([1, 1, 1, ..., 0, 0, 0], dtype=int64)
# 3-fold cross-validated accuracy of the naive Bayes classifier.
cross_val_score(
    gaussianN,
    X_train,
    y_train,
    cv=3,
    scoring="accuracy",
    n_jobs=-1,
)
array([0.68334803, 0.68639771, 0.68767224])
from sklearn import metrics
def get_results(models, X_test, y_test):
    """Score each fitted model on the held-out test set.

    Args:
        models: list of dicts with keys "name" (display label) and
            "model" (fitted estimator with a .predict method).
        X_test: test feature matrix.
        y_test: true binary labels.

    Returns:
        DataFrame indexed by model name with Accuracy, Precision and
        Recall columns.
    """
    rows = []
    for entry in models:
        predictions = entry["model"].predict(X_test)
        rows.append(
            {
                "Name": entry["name"],
                "Accuracy": metrics.accuracy_score(y_test, predictions),
                "Precision": metrics.precision_score(y_test, predictions),
                "Recall": metrics.recall_score(y_test, predictions),
            }
        )
    return pd.DataFrame(rows).set_index("Name")
# The three fitted classifiers to compare on the test set.
models = [
    {
        "name": "Random Forest Classifier",
        "model": rf_classifier,
    },
    {
        "name": "SGD Classifier",
        "model": sgd_clf,
    },
    {
        "name": "Naive Bayes",
        "model": gaussianN,
    },
]
df_test_results = get_results(models, X_test, y_test)
df_test_results
| Accuracy | Precision | Recall | |
|---|---|---|---|
| Name | |||
| Random Forest Classifier | 0.871280 | 0.866120 | 0.873586 |
| SGD Classifier | 0.749332 | 0.714572 | 0.817330 |
| Naive Bayes | 0.711797 | 0.671376 | 0.812148 |
# Single-row frames per model (df_test_results is indexed by "Name"), used
# as the data source for the bar charts below.
grouped_df_rf = df_test_results.query("Name == 'Random Forest Classifier'")
grouped_df_sgd = df_test_results.query("Name == 'SGD Classifier'")
grouped_df_nb = df_test_results.query("Name == 'Naive Bayes'")
import plotly.graph_objects as go
# Bar chart comparing model accuracies.
fig = go.Figure(
    layout=go.Layout(
        title="Accurácia Dos Modelos",
    )
)
fig.add_trace(
    go.Bar(
        x=grouped_df_rf["Accuracy"],
        name="Accuracy - Random Forest Classifier",
    ),
)
# BUG FIX: the original plotted grouped_df_nb under the "SGD Classifier"
# label and grouped_df_sgd under "Naive Bayes" — data and labels were
# swapped.
fig.add_trace(
    go.Bar(
        x=grouped_df_sgd["Accuracy"],
        name="Accuracy - SGD Classifier",
    )
)
fig.add_trace(
    go.Bar(
        x=grouped_df_nb["Accuracy"],
        name="Accuracy - Naive Bayes",
    )
)
fig.show()
# Bar chart comparing model precisions.
fig = go.Figure(
    layout=go.Layout(
        title="Precisão Dos Modelos",
    )
)
fig.add_trace(
    go.Bar(
        x=grouped_df_rf["Precision"],
        name="Precision - Random Forest Classifier",
    )
)
# BUG FIX: the original plotted grouped_df_nb under the "SGD Classifier"
# label and grouped_df_sgd under "Naive Bayes" — data and labels were
# swapped.
fig.add_trace(
    go.Bar(
        x=grouped_df_sgd["Precision"],
        name="Precision - SGD Classifier",
    )
)
fig.add_trace(
    go.Bar(
        x=grouped_df_nb["Precision"],
        name="Precision - Naive Bayes",
    )
)
fig.show()
# Bar chart comparing model recalls.
fig = go.Figure(
    layout=go.Layout(
        title="Recall Dos Modelos",
    )
)
fig.add_trace(
    go.Bar(
        x=grouped_df_rf["Recall"],
        name="Recall - Random Forest Classifier",
    )
)
# BUG FIX: the original plotted grouped_df_nb under the "SGD Classifier"
# label and grouped_df_sgd under "Naive Bayes" — data and labels were
# swapped.
fig.add_trace(
    go.Bar(
        x=grouped_df_sgd["Recall"],
        name="Recall - SGD Classifier",
    )
)
fig.add_trace(
    go.Bar(
        x=grouped_df_nb["Recall"],
        name="Recall - Naive Bayes",
    )
)
fig.show()
import numpy as np
import plotly.express as px
def get_feature_importances(model):
    """Build two polar bar charts of the model's feature importances.

    Reads the module-level `full_pipeline`, `num_attribs` and `cat_attribs`
    to recover the post-encoding column names, pairs them with
    `model.feature_importances_`, and charts the extremes.

    Returns:
        (figLeast, figMost): plotly figures for the 12 least and the 12
        most important features, respectively.
    """
    encoder = full_pipeline.transformers_[1][1]["cat_encoder"]
    # get_feature_names was removed in scikit-learn 1.2 in favor of
    # get_feature_names_out; fall back so both versions work.
    try:
        cat_cols = encoder.get_feature_names_out(cat_attribs)
    except AttributeError:
        cat_cols = encoder.get_feature_names(cat_attribs)
    cols = np.concatenate((num_attribs, cat_cols))
    df_feat_importances = pd.DataFrame(
        {"Stats": cols, "FI": model.feature_importances_}
    ).sort_values(by="FI", ascending=0)
    figLeast = px.bar_polar(
        df_feat_importances.iloc[-12:],
        r="FI",
        theta="Stats",
        color="Stats",
        template="plotly_dark",
        color_discrete_sequence=px.colors.sequential.Plasma_r,
    )
    figMost = px.bar_polar(
        df_feat_importances.iloc[:12],
        r="FI",
        theta="Stats",
        color="Stats",
        template="plotly_dark",
        color_discrete_sequence=px.colors.sequential.Plasma_r,
    )
    return (figLeast, figMost)
# Only the random forest exposes feature_importances_; the other trained
# models have no comparable attribute.
figs = [get_feature_importances(rf_classifier)]
for least_fig, most_fig in figs:
    most_fig.show()
    least_fig.show()
A medida de score definida para o modelo foi a "roc_auc", que mede a capacidade do modelo de distinguir as classes (round winner). Quanto maior o AUC, melhor a capacidade do modelo de prever outputs positivos e negativos, ou seja, melhor a capacidade do modelo de distinguir se uma determinada equipe vai vencer aquela rodada ou não.
from sklearn.model_selection import GridSearchCV
forest_clf = RandomForestClassifier(random_state=RANDOM_SEED)
# Hyperparameter grid: 8 forest sizes x 3 max_features = 24 candidates.
n_estimators = [100, 200, 250, 300, 350, 400, 450, 500]
max_features = [3, 5, 8]
param_grid = [
    {
        "n_estimators": n_estimators,
        "max_features": max_features,
    },
]
grid_search = GridSearchCV(
    forest_clf,  # model
    param_grid,  # parameter grid
    cv=5,  # number of CV partitions
    scoring="roc_auc",  # accuracy, recall, precision
    return_train_score=True,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=420),
n_jobs=-1,
param_grid=[{'max_features': [3, 5, 8],
'n_estimators': [100, 200, 250, 300, 350, 400, 450,
500]}],
return_train_score=True, scoring='roc_auc')
# Inspect the best estimator found by the grid search.
grid_search.best_estimator_
RandomForestClassifier(max_features=3, n_estimators=500, random_state=420)
def plot_grid_search(
    cv_results, grid_param_1, grid_param_2, name_param_1, name_param_2
):
    """Plot mean CV test scores from a grid search.

    Param 1 values go on the x-axis; each value of param 2 becomes its own
    colored curve. Assumes cv_results_ ordering matches the (param_2,
    param_1) grid — true for a single-dict param_grid with these two keys.
    """
    # Reshape the flat per-candidate score arrays into a
    # (len(param_2), len(param_1)) grid.
    mean_scores = np.array(cv_results["mean_test_score"]).reshape(
        len(grid_param_2), len(grid_param_1)
    )
    std_scores = np.array(cv_results["std_test_score"]).reshape(
        len(grid_param_2), len(grid_param_1)
    )
    _, ax = plt.subplots(1, 1)
    # One curve per value of the second parameter.
    for row, param_value in enumerate(grid_param_2):
        ax.plot(
            grid_param_1,
            mean_scores[row, :],
            "-o",
            label=name_param_2 + ": " + str(param_value),
        )
    ax.set_title("Grid Search Scores", fontsize=20, fontweight="bold")
    ax.set_xlabel(name_param_1, fontsize=16)
    ax.set_ylabel("CV Average Score", fontsize=16)
    ax.legend(loc="best", fontsize=15)
    ax.grid("on")
# Calling Method: visualize grid-search ROC AUC across n_estimators, one
# curve per max_features value.
plot_grid_search(
    grid_search.cv_results_, n_estimators, max_features, "N Estimators", "Max Features"
)